# Citizen Forecasts of the 2021 German Election
# Andreas Murr & Mike Lewis-Beck
# Create Table A1
# Prepare data1.csv, data2.csv, data3.csv, data5.csv

# clear working memory
# (removes every object that ls() lists in the environment the script is run in;
# attached packages and options are not reset by this call)

	rm(list=ls())

# load packages

	library(foreign)
	library(weights)

# ====================
# = election results =
# ====================

# read the election results file

	res = read.csv("election-data-germany.csv")

# divide the five party vote shares by 100
# (presumably percentages -> proportions; confirm against the csv)

	party.votes = paste("vote", c("cdu", "spd", "fdp", "gru", "lin"), sep=".")
	res[party.votes] = res[party.votes] / 100

# ==================
# = politbarometer =
# ==================

# load politbarometer and keep only the four variables used in this script
## V5 - survey month
## V6 - survey year
## Vc10 - expected winner of election
## Vx56 - survey weight

	keep = c("V5", "V6", "Vc10", "Vx56")

# read the Stata file with numeric codes (no factor conversion), select the
# variables and drop every respondent with a missing value on any of them

	data = na.omit(read.dta("ZA5100_v2-0-0.dta", convert.factors=FALSE)[keep])
	
# check which surveys are available for each election
# Survey and election dates are reduced to "YYYY MM" strings, which compare
# chronologically; each survey is assigned to the first election whose month
# is not earlier than the survey month.

	# survey month as a two-character string, then a round trip through Date
	month.2d = ifelse(nchar(data$V5)==1, paste0("0", data$V5), data$V5)
	first.of.month = as.Date(paste(data$V6, month.2d, "01"), format="%Y %m %d")
	data$date.survey = format(first.of.month, "%Y %m")

	date.election = format(as.Date(res$date, format="%d/%m/%Y"), "%Y %m")
	J = length(date.election)

	# exclusive lower bound of each election's window: the month of the
	# preceding election, and the hard-coded 03/10/1976 for the first one
	window.open = c(format(as.Date("03/10/1976", format="%d/%m/%Y"), "%Y %m"), date.election[-J])

	data$date.election = NA
	for (j in seq_len(J)){
		in.window = (data$date.survey>window.open[j]) & (data$date.survey<=date.election[j])
		data$date.election[in.window] = date.election[j]
	}

	# cross-tabulate survey month against assigned election month
	table(data$date.survey, data$date.election)

# compute lead time in calendar months: 1 = survey in the election month itself,
# 2 = survey in the month before, and so on (the day within the month is ignored,
# so a lead of 1 means roughly within 30 days of the election)

	y.e = as.numeric(substr(data$date.election, 1, 4))
	m.e = as.numeric(substr(data$date.election, 6, 7))
	y.s = as.numeric(substr(data$date.survey, 1, 4))
	m.s = as.numeric(substr(data$date.survey, 6, 7))

	# Only year gaps of 0, 1 and 2 are coded; every other gap, and every survey
	# without an assigned election, gets NA.
	# The column is computed in one vectorised step because assigning a vector
	# through a logical subscript stops with "NAs are not allowed in subscripted
	# assignments" as soon as a single survey has a missing election date.
	gap = y.e - y.s
	data$lead = ifelse(gap %in% 0:2, 12 * gap + (m.e - m.s) + 1, NA)

# table A1: number of respondents by lead time (rows) and election year (columns)

	tab.a1 = table(data$lead, substr(data$date.election, 1, 4))
	tab.a1

	# median across elections of the counts in the second row of the table
	# NOTE(review): row 2 is lead time 2 only if lead time 1 occurs in the data - confirm
	median(tab.a1[2,])

# code expectations

# 1 = cdu, 2 = spd, 3 = fdp, 4 = gru, 5 = lin, 6 = remainder

# scheme - 2 
# use single party mentions for cdu and spd
# use single party and coalition mentions for fdp, gru, lin

	# Build a party's expectation indicator from a logical vector `named` that
	# flags the respondents whose Vc10 code counts as expecting that party:
	# 1 where `named` is TRUE, 0 for every other Vc10 code in 1:30, NA otherwise.
	as.expectation = function(named){
		out = rep(NA, nrow(data))
		out[data$Vc10%in%1:30 & !named] = 0
		out[named] = 1
		out
	}

	# the codes counted for fdp and gru differ between survey years up to 1998
	# and survey years after 1998 (code 3 moves from fdp to gru)
	early = data$V6<=1998
	late = data$V6>1998

	data$exp.cdu = as.expectation(data$Vc10%in%1)

	data$exp.spd = as.expectation(data$Vc10%in%6)

	data$exp.fdp = as.expectation((data$Vc10%in%c(2:3,8,16,21) & early) | (data$Vc10%in%c(2,8,16,21) & late))

	data$exp.gru = as.expectation((data$Vc10%in%c(7,10,15,21,22,27) & early) | (data$Vc10%in%c(3,7,10,15,21,22,27) & late))

	data$exp.lin = as.expectation(data$Vc10%in%c(9,10,17,22))

# create expectation matrix for lead times of 1, 2, 3, and 5 months
# Each matrix has one row per election: the election year followed by, for each
# party, the weighted share of respondents expecting it (0 when nobody does).

	election = unique(data$date.election)
	E = length(election)
	exp.names = paste("exp", c("cdu", "spd", "fdp", "gru", "lin"), sep=".")

	# weighted share of ones in the 0/1 vector x, using survey weights w;
	# 0 when x sums to zero (this includes an empty x), NA when the sum is NA
	# NOTE(review): takes the second entry returned by wpct(), which presumably
	# is the share of ones only when x contains both zeros and ones - confirm
	share.of.ones = function(x, w){
		n.ones = sum(x)
		if (is.na(n.ones)) return(NA)
		if (n.ones==0) return(0)
		wpct(x, weight=w)[2]
	}

	# expectation matrix built from the respondents whose lead time equals `months`
	expectation.matrix = function(months){
		out = matrix(0, nrow=E, ncol=6, dimnames=list(NULL, c("year", exp.names)))
		for (i in 1:E){
			sel = data[data$date.election==election[i] & data$lead==months,]
			out[i,"year"] = as.numeric(substr(election[i], 1, 4))
			for (p in exp.names){
				out[i,p] = share.of.ones(sel[[p]], sel$Vx56)
			}
		}
		out
	}

	exp.mat1 = expectation.matrix(1)
	exp.mat2 = expectation.matrix(2)
	exp.mat3 = expectation.matrix(3)
	exp.mat5 = expectation.matrix(5)

	# convert a matrix to a data frame and set the five shares to NA in every
	# row where they sum to zero (no respondent at that lead time expected any
	# of the five parties, e.g. because there was no such survey)
	blank.empty.rows = function(mat){
		sur = data.frame(mat)
		is.share = names(sur)%in%exp.names
		all.zero = apply(sur[,is.share], 1, function(x){sum(x)==0})
		sur[all.zero,is.share] = NA
		sur
	}

	sur1 = blank.empty.rows(exp.mat1)
	sur2 = blank.empty.rows(exp.mat2)
	sur3 = blank.empty.rows(exp.mat3)
	sur5 = blank.empty.rows(exp.mat5)

# ==============
# = merge data =
# ==============	

# merge data: join the election results to the survey expectations by year,
# then recode

	# Recoding applied to every merged data set:
	# - exp.lin is set to NA wherever vote.lin is missing
	# - vote.oth is 1 minus the summed vote shares of cdu, spd, fdp, gru and,
	#   where it is not missing, lin
	add.remainder = function(d){
		no.lin = is.na(d$vote.lin)
		d$exp.lin[no.lin] = NA
		four = d$vote.cdu + d$vote.spd + d$vote.fdp + d$vote.gru
		d$vote.oth = ifelse(no.lin, 1 - four, 1 - (four + d$vote.lin))
		d
	}

	# NOTE(review): the 2-month data keep elections whose expectations are NA,
	# while the 1-, 3- and 5-month data drop them before merging - confirm intended
	data2 = add.remainder(merge(res, sur2, by="year"))
	data1 = add.remainder(merge(res, na.omit(sur1), by="year"))
	data3 = add.remainder(merge(res, na.omit(sur3), by="year"))
	data5 = add.remainder(merge(res, na.omit(sur5), by="year"))

# save data: drop the 1980 election from each data set, then write it to csv

	without.1980 = function(d) d[d$year!=1980,]

	data1 = without.1980(data1)
	data2 = without.1980(data2)
	data3 = without.1980(data3)
	data5 = without.1980(data5)

	# one csv per lead time, without a row-name column
	write.csv(data1, file="data1.csv", row.names=FALSE)
	write.csv(data2, file="data2.csv", row.names=FALSE)
	write.csv(data3, file="data3.csv", row.names=FALSE)
	write.csv(data5, file="data5.csv", row.names=FALSE)
	
# ===================
# = end source code =
# ===================	